{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "10 ['Jim', 'is', 'bringing', 'his', 'bulldog', 'to', 'eat', 'at', 'Friendlys', '?']\n"
     ]
    }
   ],
   "source": [
    "from nltk.tokenize import word_tokenize\n",
    "\n",
    "# Example sentence analysed by the cells below.\n",
    "sentence = \"Jim is bringing his bulldog to eat at Friendlys?\"\n",
    "\n",
    "# Split the sentence into word and punctuation tokens.\n",
    "tokens = word_tokenize(sentence)\n",
    "print('{} {}'.format(len(tokens), tokens))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "6 ['Jim', 'bringing', 'bulldog', 'eat', 'Friendlys', '?']\n"
     ]
    }
   ],
   "source": [
    "from nltk.corpus import stopwords\n",
    "\n",
    "# NLTK's English stop-word list is lower-case, so compare case-insensitively;\n",
    "# otherwise a capitalised stop word (e.g. a sentence-initial 'The') slips through.\n",
    "stop_words = set(stopwords.words('english'))\n",
    "\n",
    "# NOTE: this rebinds `tokens` to the filtered list; re-run the tokenising cell\n",
    "# to get the unfiltered tokens back.\n",
    "tokens = [w for w in tokens if w.lower() not in stop_words]\n",
    "print(len(tokens), tokens)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "jim\n",
      "bring\n",
      "bulldog\n",
      "eat\n",
      "friend\n",
      "?\n"
     ]
    }
   ],
   "source": [
    "from nltk.stem import LancasterStemmer\n",
    "\n",
    "# Print the Lancaster stem of each remaining token, one per line.\n",
    "stemmer = LancasterStemmer()\n",
    "for token in tokens:\n",
    "    print(stemmer.stem(token))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[('Jim', 'NNP'), ('bringing', 'VBG'), ('bulldog', 'NN'), ('eat', 'VB'), ('Friendlys', 'NNP'), ('?', '.')] Nouns: [('Jim', 'NNP'), ('bulldog', 'NN'), ('Friendlys', 'NNP')]\n"
     ]
    }
   ],
   "source": [
    "import nltk\n",
    "\n",
    "# The off-the-shelf tagger uses the Penn Treebank tagset:\n",
    "# https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html\n",
    "#\n",
    "# Tag the complete sentence first and drop stop words afterwards: tagging the\n",
    "# already-filtered tokens removes the context the tagger relies on and\n",
    "# mislabelled words here ('bulldog' came out as JJ and 'eat' as NN).\n",
    "tagged_sentence = nltk.pos_tag(nltk.word_tokenize(sentence))\n",
    "tags = [(word, tag) for word, tag in tagged_sentence\n",
    "        if word.lower() not in stop_words]\n",
    "\n",
    "# Every Penn Treebank noun tag starts with 'NN' (NN, NNS, NNP, NNPS).\n",
    "nouns = [pair for pair in tags if pair[1].startswith('NN')]\n",
    "print(tags, \"Nouns:\", nouns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[('Jim', 'NNP', 'B-PERSON'), ('is', 'VBZ', 'O'), ('bringing', 'VBG', 'O'), ('his', 'PRP$', 'O'), ('bulldog', 'NN', 'O'), ('to', 'TO', 'O'), ('eat', 'VB', 'O'), ('at', 'IN', 'O'), ('Friendlys', 'NNP', 'B-ORGANIZATION'), ('?', '.', 'O')]\n",
      "[('Friendlys', 'NNP', 'B-ORGANIZATION')]\n"
     ]
    }
   ],
   "source": [
    "# Run once if these NLTK resources are not installed yet:\n",
    "# nltk.download('maxent_ne_chunker')\n",
    "# nltk.download('words')\n",
    "from nltk import word_tokenize, pos_tag, ne_chunk\n",
    "from nltk.chunk import conlltags2tree, tree2conlltags\n",
    "\n",
    "# Tokenise and POS-tag the full sentence, then chunk it into named entities.\n",
    "pos_tagged = pos_tag(word_tokenize(sentence))\n",
    "ne_tree = ne_chunk(pos_tagged)\n",
    "\n",
    "# Flatten the chunk tree into (word, POS tag, IOB tag) triples, where the IOB tag is\n",
    "#   B-{CHUNK_TYPE}  for the word at the Beginning of a chunk\n",
    "#   I-{CHUNK_TYPE}  for words Inside a chunk\n",
    "#   O               for words Outside any chunk\n",
    "iob_tagged = tree2conlltags(ne_tree)\n",
    "print(iob_tagged)\n",
    "\n",
    "# Show only the words that open an ORGANIZATION chunk.\n",
    "print([entry for entry in iob_tagged if entry[2] == 'B-ORGANIZATION'])\n",
    "# Further reading: http://streamhacker.com/2008/12/29/how-to-train-a-nltk-chunker/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "('bulldog', 'NN', 'O')\n",
      "a sturdy thickset short-haired breed with a large head and strong undershot lower jaw; developed originally in England for bull baiting\n"
     ]
    }
   ],
   "source": [
    "from nltk.corpus import wordnet\n",
    "\n",
    "# Take the first IOB-tagged entry whose POS tag is exactly NN.\n",
    "nn_entries = [entry for entry in iob_tagged if entry[1] == 'NN']\n",
    "noun = nn_entries[0]\n",
    "print(noun)\n",
    "\n",
    "# Look the word up in WordNet and print the definition of its first synset.\n",
    "syns = wordnet.synsets(noun[0])\n",
    "print(syns[0].definition())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.8387096774193549\n",
      "0.36363636363636365\n",
      "0.13333333333333333\n"
     ]
    }
   ],
   "source": [
    "# Compare 'bulldog' with three other nouns using WordNet's wup_similarity.\n",
    "w1 = wordnet.synset('bulldog.n.01')\n",
    "for synset_name in ('poodle.n.01', 'car.n.01', 'space.n.01'):\n",
    "    w2 = wordnet.synset(synset_name)\n",
    "    print(w1.wup_similarity(w2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'bulldog', 'English_bulldog'}\n",
      "set()\n",
      "{'productive', 'deep', 'racy', 'copious', 'ample', 'fat', 'robust', 'plenteous', 'plentiful', 'fertile', 'rich_people', 'full-bodied', 'rich'}\n",
      "{'poor', 'poor_people', 'lean'}\n"
     ]
    }
   ],
   "source": [
    "def syn_ant(word):\n",
    "    \"\"\"Collect WordNet synonyms and antonyms for ``word``.\n",
    "\n",
    "    Walks every synset returned by ``wordnet.synsets(word)`` and every lemma\n",
    "    of each synset.\n",
    "\n",
    "    Args:\n",
    "        word: the word to look up.\n",
    "\n",
    "    Returns:\n",
    "        A ``(synonyms, antonyms)`` tuple of lists of lemma names. Names can\n",
    "        repeat across synsets; wrap a list in ``set`` to de-duplicate it.\n",
    "    \"\"\"\n",
    "    synonyms = []\n",
    "    antonyms = []\n",
    "    for synset in wordnet.synsets(word):\n",
    "        for lemma in synset.lemmas():\n",
    "            synonyms.append(lemma.name())\n",
    "            # Keep every antonym of the lemma, not only the first one.\n",
    "            antonyms.extend(opposite.name() for opposite in lemma.antonyms())\n",
    "    return synonyms, antonyms\n",
    "\n",
    "s, a = syn_ant(noun[0])\n",
    "print(set(s))\n",
    "print(set(a))\n",
    "s, a = syn_ant('rich')\n",
    "print(set(s))\n",
    "print(set(a))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
